title: “Case Study 2” author: “Fabio” date: “8/2/2019” output: html_document: toc: TRUE toc_float: collapsed: true smooth_scroll: false theme: paper df_print: paged
We loaded the data
We performed an extensive exploratory analysis of the different variables in the data set in order to predict attrition and monthly income.
We finally after a lot of trial and error choose the Naive bayes model to predict attrition with accuracy od 0,7373, Sensitivity 0.7473, Specificity 0.6857.
The variables that we used for our Naive bayes model were the following : StockOptionLevel, JobLevel, MonthlyIncome, OverTime.
The 3 most important variables that we found in this dataset which are related to attrition are the following : MonthlyIncome, OverTime, StockOptionLevel.
Also the longer that the employee spend in the company or with the current manager, or the longer that the employee are working in general it is less likely to leave their job. Other trend, single employee tend to have more attrition, the further that an employee live from work the more likely of attrition.
We choose the regression model to predict monthly income and it perform with an Adjusted R square of 0.9494, RMSE of 927.0692.
The most inportant variables that we used for the correlation model to predict monthly income were the following : TotalWorkingYears, JobLevel, JobRole.
library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(tidyverse)
library(stringr)
library(caret)
library(fpp2)
library(dygraphs)
library(xts)
library(pander)
library(purrr)
library(ggthemes)
library(gridExtra)
library(cowplot)
library(RColorBrewer)
library(gplots)
library(corrplot)
library(functional)
library(fastNaiveBayes)
library(ggpubr)
dfdata<- readr::read_csv("data/CaseStudy2-data.csv")
head(dfdata)
dfdata %>% count(Attrition) ->att
att
You can also embed plots, for example:
agePlot <- ggplot(dfdata,aes(Age,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
agePlot
travelPlot <- ggplot(dfdata,aes(BusinessTravel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
travelPlot
ratePlot <- ggplot(dfdata,aes(DailyRate, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
ratePlot
depPlot <- ggplot(dfdata,aes(Department,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
depPlot
distPlot<- ggplot(dfdata,aes(DistanceFromHome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
distPlot
eduPlot <- ggplot(dfdata,aes(Education,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
eduPlot
edufieldPlot <- ggplot(dfdata,aes(EducationField,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
edufieldPlot
envPlot <- ggplot(dfdata,aes(EnvironmentSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
envPlot
genPlot <- ggplot(dfdata,aes(Gender,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
genPlot
hourlyPlot <- ggplot(dfdata,aes(HourlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
hourlyPlot
jobInvPlot <- ggplot(dfdata,aes(JobInvolvement,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #important
jobInvPlot
jobLevelPlot <- ggplot(dfdata,aes(JobLevel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobLevelPlot
jobSatPlot <- ggplot(dfdata,aes(JobSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobSatPlot
overTimePlot <- ggplot(dfdata,aes(OverTime,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
overTimePlot
hikePlot <- ggplot(dfdata,aes(PercentSalaryHike, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
hikePlot
perfPlot <- ggplot(dfdata,aes(PerformanceRating,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
perfPlot
RelSatPlot <- ggplot(dfdata,aes(RelationshipSatisfaction,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
RelSatPlot
StockPlot <- ggplot(dfdata,aes(StockOptionLevel,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
StockPlot
workingYearsPlot <- ggplot(dfdata,aes(TotalWorkingYears,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
workingYearsPlot
TrainTimesPlot <- ggplot(dfdata,aes(TrainingTimesLastYear,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
TrainTimesPlot
WLBPlot<- ggplot(dfdata,aes(WorkLifeBalance,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
WLBPlot
marPlot <- ggplot(dfdata,aes(MaritalStatus,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")#maybe
marPlot
monthlyIncPlot <- ggplot(dfdata,aes(MonthlyIncome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
monthlyIncPlot
monthlyRatePlot <- ggplot(dfdata,aes(MonthlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
monthlyRatePlot
numCompPlot <- ggplot(dfdata,aes(NumCompaniesWorked,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
numCompPlot
YearAtComPlot <- ggplot(dfdata,aes(YearsAtCompany,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearAtComPlot
YearInCurrPlot <- ggplot(dfdata,aes(YearsInCurrentRole,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
YearInCurrPlot
YearsSinceProm <- ggplot(dfdata,aes(YearsSinceLastPromotion,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
YearsSinceProm
YearsCurrManPlot <- ggplot(dfdata,aes(YearsWithCurrManager,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearsCurrManPlot
myplot <- ggplot(dfdata, aes(BusinessTravel, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(Department, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(Education, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(EducationField, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(EnvironmentSatisfaction, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(Gender, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(JobInvolvement, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(JobLevel, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(JobSatisfaction, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(OverTime, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(RelationshipSatisfaction, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(StockOptionLevel, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(WorkLifeBalance, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(MaritalStatus, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(YearsInCurrentRole, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot #maybe
myplot <- ggplot(dfdata, aes(YearsWithCurrManager, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(YearsAtCompany, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important/maybe
myplot <- ggplot(dfdata, aes(NumCompaniesWorked, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(TrainingTimesLastYear, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(TotalWorkingYears, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(DistanceFromHome, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(Age, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(EducationField, group = JobSatisfaction )) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~JobSatisfaction)
myplot
ys <- names(dfdata)[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36)]
ys %>% map(function(y)
ggplot(dfdata , aes(MonthlyIncome)) + geom_point(aes_string(y=y)))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
#### More exploratory data for montly income with heatmap for numerical values
dfn <-dfdata[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36 )]
my_palette <- colorRampPalette(c("red", "white", "black"))
heatmapper <- function(df){
df %>%
keep(is.numeric) %>%
tidyr::drop_na() %>%
cor %>%
heatmap.2(col = my_palette ,
density.info = "none", trace = "none",
dendogram = c("both"), symm = F,
symkey = T, symbreaks = T, scale = "none",
key = T)
}
heatmapper(dfn)
correlator <- function(df){
df %>%
keep(is.numeric) %>%
tidyr::drop_na() %>%
cor %>%
corrplot( addCoef.col = "white", number.digits = 2,
number.cex = 0.5, method="square",
order="hclust", title="Variable Corr Heatmap",
tl.srt=45, tl.cex = 0.8)
}
correlator(dfn)
#### More exploratory data for montly income with ggplot numeric for numerical values
plotAllNumeric <- function(df){
df%>%keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()+geom_histogram() + theme_fivethirtyeight()
}
plotAllNumeric(dfn)
dfdata %>% keep(is.factor) %>% names -> label
ggplot(data = dfdata, aes(x = BusinessTravel, y = MonthlyIncome, fill =BusinessTravel )) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = Department, y = MonthlyIncome, fill = Department)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(Education), y = MonthlyIncome, fill = as.factor(Education))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important
ggplot(data = dfdata, aes(x = EducationField, y = MonthlyIncome, fill = EducationField)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(EnvironmentSatisfaction), y = MonthlyIncome, fill = as.factor(EnvironmentSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = Gender, y = MonthlyIncome, fill = Gender)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(JobInvolvement), y = MonthlyIncome, fill = as.factor(JobInvolvement))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(JobLevel), y = MonthlyIncome, fill = as.factor(JobLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important
ggplot(data = dfdata, aes(x = JobRole, y = MonthlyIncome, fill = JobRole)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()# important
ggplot(data = dfdata, aes(x = as.factor(JobSatisfaction), y = MonthlyIncome, fill = as.factor(JobSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = MaritalStatus, y = MonthlyIncome, fill = MaritalStatus)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = OverTime, y = MonthlyIncome, fill = OverTime)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(PerformanceRating), y = MonthlyIncome, fill = as.factor(PerformanceRating))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() #maybe
ggplot(data = dfdata, aes(x = as.factor(RelationshipSatisfaction), y = MonthlyIncome, fill = as.factor(RelationshipSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(StockOptionLevel), y = MonthlyIncome, fill = as.factor(StockOptionLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # maybe
ggplot(data = dfdata, aes(x = as.factor(WorkLifeBalance), y = MonthlyIncome, fill =as.factor(WorkLifeBalance))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
dfdata$YearsInCurrentRole<- cut(as.numeric(dfdata$YearsInCurrentRole), breaks = c(-1,1,100))
dfdata$TotalWorkingYears<- cut(as.numeric(dfdata$TotalWorkingYears), breaks = c(-1,1,100))
dfdata$YearsWithCurrManager<- cut(as.numeric(dfdata$YearsWithCurrManager), breaks = c(-1,1,100))
dfdata %>% filter(MaritalStatus %in% c("Single", "Divorce"))-> MaritalStatus
dfdata$MaritalStatus <- as.factor(dfdata$MaritalStatus)
levels(dfdata$MaritalStatus) <- c("NotMarried", "Married", "NotMarried")
clasy<- dfdata[c("Attrition", "StockOptionLevel", "JobLevel", "MonthlyIncome", "OverTime")]
clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")] <- lapply(clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(clasy)
set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
train_ind <- createDataPartition(df[[1]], p = p, list = list)
cat("creating training dataset...\n")
training <<- df[train_ind, ]
cat("completed training dataset, creating test set\n")
test <<- df[-train_ind, ]
cat("done")
}
split(clasy)
## creating training dataset...
## completed training dataset, creating test set
## done
library(doParallel)
numcores <- parallel::detectCores() - 1
cl <- makePSOCKcluster(numcores)
registerDoParallel(cl)
set.seed(3333)
trainMethod <- trainControl( method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
fit.nb1 <- train(Attrition ~ ., data = training, method = "nb", metric = "Spec", trControl = trainMethod)
fit.nb1
## Naive Bayes
##
## 653 samples
## 4 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times)
## Summary of sample sizes: 627, 626, 627, 628, 626, 626, ...
## Resampling results across tuning parameters:
##
## usekernel ROC Sens Spec
## FALSE 0.7232026 0.7445195 0.6584
## TRUE 0.7668199 0.9992727 0.0000
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Spec was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE
## and adjust = 1.
plot(fit.nb1)
test_pred <- predict(fit.nb1, newdata = test)
test_pred
## [1] No Yes No Yes Yes No No No No Yes No No No No No No Yes
## [18] No Yes No No Yes Yes Yes Yes No No No No No No No Yes No
## [35] No No Yes No No No No No No No No No No No No No No
## [52] No No No No Yes No No Yes No No No Yes No No No Yes No
## [69] No Yes No No No No Yes No Yes Yes No Yes No No No Yes No
## [86] Yes No Yes No Yes No No No Yes No Yes No No No No Yes No
## [103] Yes No Yes No No No Yes No No Yes No No Yes No No No No
## [120] No Yes Yes Yes No No No No Yes No No Yes Yes Yes No Yes Yes
## [137] Yes No No No No No No Yes No No No No Yes Yes Yes Yes No
## [154] No Yes No Yes No No Yes No No No No No No No Yes No No
## [171] Yes Yes No Yes No No No No Yes Yes Yes No Yes No No Yes Yes
## [188] Yes Yes No No No No No No Yes Yes No No Yes No Yes No Yes
## [205] No No Yes No No No Yes No No No No No No
## Levels: No Yes
confusionMatrix(test_pred, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 136 11
## Yes 46 24
##
## Accuracy : 0.7373
## 95% CI : (0.6735, 0.7946)
## No Information Rate : 0.8387
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.3084
##
## Mcnemar's Test P-Value : 6.687e-06
##
## Sensitivity : 0.7473
## Specificity : 0.6857
## Pos Pred Value : 0.9252
## Neg Pred Value : 0.3429
## Prevalence : 0.8387
## Detection Rate : 0.6267
## Detection Prevalence : 0.6774
## Balanced Accuracy : 0.7165
##
## 'Positive' Class : No
##
stopCluster(cl)
dfdata<- readr::read_csv("data/CaseStudy2-data.csv")
reg <- dfdata[c("TotalWorkingYears", "YearsAtCompany", "Age", "Education", "JobLevel", "JobRole", "MonthlyIncome", "PerformanceRating", "StockOptionLevel" )]
reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)
reg
set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
train_ind <- createDataPartition(df[[1]], p = p, list = list)
cat("creating training dataset...\n")
training <<- df[train_ind, ]
cat("completed training dataset, creating test set\n")
test <<- df[-train_ind, ]
cat("done")
}
split(reg)
## creating training dataset...
## completed training dataset, creating test set
## done
regincome1 <- lm( MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole, data = training)
summary(regincome1)
##
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3085.0 -647.5 -97.3 638.3 4284.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3642.521 220.489 16.520 < 2e-16 ***
## TotalWorkingYears 53.411 9.134 5.847 7.96e-09 ***
## JobLevel2 1598.495 170.893 9.354 < 2e-16 ***
## JobLevel3 4650.058 226.324 20.546 < 2e-16 ***
## JobLevel4 7872.592 345.320 22.798 < 2e-16 ***
## JobLevel5 10669.268 394.495 27.045 < 2e-16 ***
## JobRoleHuman Resources -1242.551 320.507 -3.877 0.000117 ***
## JobRoleLaboratory Technician -1309.554 212.884 -6.151 1.35e-09 ***
## JobRoleManager 3557.429 293.615 12.116 < 2e-16 ***
## JobRoleManufacturing Director 239.931 187.576 1.279 0.201319
## JobRoleResearch Director 3701.321 252.276 14.672 < 2e-16 ***
## JobRoleResearch Scientist -1091.211 216.239 -5.046 5.87e-07 ***
## JobRoleSales Executive 8.403 162.191 0.052 0.958697
## JobRoleSales Representative -1399.737 261.799 -5.347 1.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1036 on 641 degrees of freedom
## Multiple R-squared: 0.9504, Adjusted R-squared: 0.9494
## F-statistic: 944 on 13 and 641 DF, p-value: < 2.2e-16
pred <- predict(regincome1, newdata = test)
str(pred)
## Named num [1:215] 9067 9049 2872 9013 5668 ...
## - attr(*, "names")= chr [1:215] "1" "2" "3" "4" ...
ASA2 <- mean((pred[1:nrow(test)] - test$MonthlyIncome)^2)
sqrt(ASA2)
## [1] 927.0692
AIC(regincome1)
## [1] 10970.58
BIC(regincome1)
## [1] 11037.85
pred2 <- predict(regincome1, newdata = test)
pred2
## 1 2 3 4 5 6 7
## 9066.617 9048.732 2871.775 9013.207 5668.301 2560.202 2934.077
## 8 9 10 11 12 13 14
## 5730.115 5908.232 5854.821 5775.123 5783.526 2867.074 9102.143
## 15 16 17 18 19 20 21
## 6103.990 19792.003 8781.679 6068.464 2867.074 2403.017 8835.089
## 22 23 24 25 26 27 28
## 8835.089 9529.428 2386.378 6015.054 13357.364 6317.632 5775.123
## 29 30 31 32 33 34 35
## 16300.988 6148.998 8995.321 5569.883 2818.364 2764.953 6095.587
## 36 37 38 39 40 41 42
## 4465.569 6015.054 5218.019 12858.783 4412.158 2760.253 13517.596
## 43 44 45 46 47 48 49
## 5801.411 2386.378 4683.912 16658.522 2658.132 8781.679 9013.207
## 50 51 52 53 54 55 56
## 19508.610 2867.074 9574.436 2386.378 12698.551 2978.596 18990.843
## 57 58 59 60 61 62 63
## 2818.364 2604.721 13117.433 6015.054 12858.783 6103.990 19151.075
## 64 65 66 67 68 69 70
## 13008.704 2563.249 9120.028 5890.347 2604.721 12848.472 5676.704
## 71 72 73 74 75 76 77
## 2653.431 16071.005 2386.378 5783.526 4321.975 5836.936 2658.132
## 78 79 80 81 82 83 84
## 2813.664 5569.883 4732.622 4518.980 12741.650 2600.021 2871.775
## 85 86 87 88 89 90 91
## 6015.054 5454.659 5164.608 16300.988 9048.732 19471.539 5676.704
## 92 93 94 95 96 97 98
## 12645.140 5801.411 5004.376 5668.301 9529.428 6050.579 2818.364
## 99 100 101 102 103 104 105
## 19188.146 4732.622 2764.953 4683.912 16925.576 5997.168 2706.842
## 106 107 108 109 110 111 112
## 16728.273 8835.089 2973.896 4950.965 2546.610 5775.123 2827.256
## 113 114 115 116 117 118 119
## 2871.775 5783.526 4465.569 2871.775 5676.704 2978.596 2296.195
## 120 121 122 123 124 125 126
## 8986.918 2403.017 2706.842 2600.021 2925.185 5561.480 2871.775
## 127 128 129 130 131 132 133
## 9360.793 2604.721 6228.696 3085.417 8531.936 2720.434 4532.572
## 134 135 136 137 138 139 140
## 2546.610 2706.842 2871.775 5783.526 5997.168 2604.721 5569.883
## 141 142 143 144 145 146 147
## 2706.842 5561.480 5057.787 2776.891 2349.606 16925.576 5730.115
## 148 149 150 151 152 153 154
## 8773.276 7530.725 2760.253 12634.829 9760.956 9048.732 6264.222
## 155 156 157 158 159 160 161
## 2439.789 8781.679 2818.364 12651.169 5801.411 4465.569 8995.321
## 162 163 164 165 166 167 168
## 2493.199 4465.569 17139.219 5569.883 3085.417 2818.364 6442.339
## 169 170 171 172 173 174 175
## 2925.185 4532.572 2711.543 2403.017 4897.555 4683.912 9066.617
## 176 177 178 179 180 181 182
## 5961.643 2760.253 2818.364 2386.378 2453.381 12384.115 5988.765
## 183 184 185 186 187 188 189
## 2764.953 2349.606 16461.220 6264.222 4465.569 3085.417 6050.579
## 190 191 192 193 194 195 196
## 9360.793 2764.953 2773.845 5463.061 15697.130 12651.169 19829.074
## 197 198 199 200 201 202 203
## 3032.007 16194.166 4839.444 9093.740 5569.883 2658.132 2706.842
## 204 205 206 207 208 209 210
## 2439.789 2813.664 19578.360 2332.967 5569.883 4465.569 2386.378
## 211 212 213 214 215
## 5463.061 2867.074 5694.589 13019.015 5836.936
noinc <- read_csv("data/CaseStudy2CompSet No Salary (2).csv")
noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)
noinc
preds <-predict(regincome1, newdata = noinc)
preddf <- data.frame(predicted = preds, ID = noinc$ID)
preddf <- preddf %>% arrange(ID)
write.csv(preddf, "Case2PredictionsSavorgnanSalary.csv")
library(readr)
noatt <- read_csv("data/CaseStudy2CompSet No Attrition.csv")
noatt[c("StockOptionLevel", "JobLevel", "OverTime")] <- lapply(noatt[c("StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(noatt)
test_preda <- predict(fit.nb1, newdata = noatt)
test_preda
## [1] No No No No No No No Yes No No Yes Yes No No Yes No No
## [18] No No No Yes No Yes No No No No Yes No Yes Yes No No No
## [35] Yes No No No No No Yes Yes No No Yes No No No Yes No No
## [52] Yes No No No No Yes No No Yes No No Yes No No No No No
## [69] No Yes No No No No No Yes No Yes Yes No No No No No No
## [86] No No No No No No No Yes No No No No No Yes Yes No No
## [103] Yes No No No No No Yes No No No No No No No Yes Yes Yes
## [120] No No Yes No No No No Yes No No No No No No No No No
## [137] No No Yes No No No Yes Yes Yes No No Yes Yes No No No No
## [154] No Yes No Yes Yes Yes No No No Yes No No Yes No No Yes No
## [171] Yes Yes No Yes No No No No No No No No No No No No No
## [188] Yes No No Yes No No No Yes Yes Yes No No No No No Yes No
## [205] No Yes No No No No Yes No Yes No No Yes No No No No Yes
## [222] No No No No Yes Yes No Yes No Yes Yes No Yes Yes Yes Yes No
## [239] No No Yes Yes No Yes No No Yes Yes No Yes Yes No No Yes No
## [256] No No Yes Yes No No No No No No No No No Yes No No No
## [273] No Yes Yes Yes No No No Yes No No No No No Yes Yes Yes Yes
## [290] Yes No No No No Yes Yes No Yes No No
## Levels: No Yes
predi <- data.frame(predicted = test_preda, ID = noatt$ID)
predited <- predi %>% arrange(ID)
write.csv(predited,"Case2PredictionsSavorgnanAttrition.csv")